In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import plotly.io as pio
pio.renderers.default = 'notebook'
In [50]:
df = pd.read_csv('DajuBhaiMinimart.csv')
df
Out[50]:
| CustomerID | CustomerName | Quantity | Price (NPR) | Sales (NPR) | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | StoreLocation | DiscountApplied (NPR) | LoyaltyMember | CustomerContact | Feedback | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST001 | Bishal Gurung | 11 | 1303 | 17199 | Website | 4 | Electronics | Card | 12/6/2023 5:13 | - | 60 | Yes | 9819894305 | Quick delivery | Male |
| 1 | CUST002 | Sita Tamang | 10 | 610 | 7246 | Market Visit | 4 | Clothing | Mobile Payment | 12/22/2023 12:54 | Pokhara | 240 | Yes | 9823584905 | Satisfied with quality | Female |
| 2 | CUST003 | Sarita Lama | 7 | 1812 | 12684 | Market Visit | 3 | Electronics | Mobile Payment | 10/13/2024 1:03 | Lalitpur | 200 | No | 9823088823 | Friendly staff | Female |
| 3 | CUST004 | Ram Shrestha | 3 | 1070 | 3210 | Website | 1 | Electronics | Mobile Payment | 6/5/2024 6:08 | - | 100 | No | 9888996168 | Quick delivery | Male |
| 4 | CUST005 | Sita Tamang | 10 | 1209 | 14362 | Market Visit | 5 | Personal Care | Card | 10/26/2024 16:28 | Bhaktapur | 60 | Yes | 9874939195 | Prices a bit high | Female |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 295 | CUST296 | Kamal Sharma | 6 | 919 | 5514 | Website | 2 | Personal Care | Mobile Payment | 4/23/2024 5:18 | - | 50 | No | 9835949686 | Satisfied with quality | Male |
| 296 | CUST297 | Asha Rai | 7 | 685 | 4795 | Market Visit | 3 | Groceries | Mobile Payment | 10/5/2024 5:05 | Lalitpur | 0 | No | 9869059335 | Satisfied with quality | Male |
| 297 | CUST298 | Kamal Sharma | 8 | 342 | 2736 | Market Visit | 1 | Electronics | Card | 12/1/2023 8:10 | Pokhara | 200 | No | 9866858364 | Prices a bit high | Male |
| 298 | CUST299 | Gurax Gurung | 11 | 217 | 2864 | Market Visit | 4 | Household | Cash | 1/21/2024 2:00 | Pokhara | 60 | Yes | 9816458335 | Friendly staff | Female |
| 299 | CUST300 | Sarita Lama | 9 | 1644 | 17360 | Market Visit | 3 | Clothing | Cash | 11/12/2023 14:47 | Lalitpur | 0 | Yes | 9893314124 | Prices a bit high | Female |
300 rows × 16 columns
Data Cleaning¶
In [51]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300 entries, 0 to 299 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 300 non-null object 1 CustomerName 300 non-null object 2 Quantity 300 non-null int64 3 Price (NPR) 300 non-null int64 4 Sales (NPR) 300 non-null int64 5 Channel 300 non-null object 6 Ratings 300 non-null int64 7 ProductCategory 300 non-null object 8 PaymentMethod 300 non-null object 9 TransactionDate 300 non-null object 10 StoreLocation 300 non-null object 11 DiscountApplied (NPR) 300 non-null int64 12 LoyaltyMember 300 non-null object 13 CustomerContact 300 non-null int64 14 Feedback 300 non-null object 15 Gender 300 non-null object dtypes: int64(6), object(10) memory usage: 37.6+ KB
In [52]:
df.isnull().sum()
Out[52]:
CustomerID 0 CustomerName 0 Quantity 0 Price (NPR) 0 Sales (NPR) 0 Channel 0 Ratings 0 ProductCategory 0 PaymentMethod 0 TransactionDate 0 StoreLocation 0 DiscountApplied (NPR) 0 LoyaltyMember 0 CustomerContact 0 Feedback 0 Gender 0 dtype: int64
In [53]:
# drop customer_id column
df.drop('CustomerID', axis=1, inplace=True)
In [54]:
#rename the column name
df = df.rename(columns={'Price (NPR)': 'Price'})
df = df.rename(columns={'Sales (NPR)': 'Sales'})
df = df.rename(columns={'DiscountApplied (NPR)': 'DiscountApplied'})
In [55]:
df
Out[55]:
| CustomerName | Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | StoreLocation | DiscountApplied | LoyaltyMember | CustomerContact | Feedback | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bishal Gurung | 11 | 1303 | 17199 | Website | 4 | Electronics | Card | 12/6/2023 5:13 | - | 60 | Yes | 9819894305 | Quick delivery | Male |
| 1 | Sita Tamang | 10 | 610 | 7246 | Market Visit | 4 | Clothing | Mobile Payment | 12/22/2023 12:54 | Pokhara | 240 | Yes | 9823584905 | Satisfied with quality | Female |
| 2 | Sarita Lama | 7 | 1812 | 12684 | Market Visit | 3 | Electronics | Mobile Payment | 10/13/2024 1:03 | Lalitpur | 200 | No | 9823088823 | Friendly staff | Female |
| 3 | Ram Shrestha | 3 | 1070 | 3210 | Website | 1 | Electronics | Mobile Payment | 6/5/2024 6:08 | - | 100 | No | 9888996168 | Quick delivery | Male |
| 4 | Sita Tamang | 10 | 1209 | 14362 | Market Visit | 5 | Personal Care | Card | 10/26/2024 16:28 | Bhaktapur | 60 | Yes | 9874939195 | Prices a bit high | Female |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 295 | Kamal Sharma | 6 | 919 | 5514 | Website | 2 | Personal Care | Mobile Payment | 4/23/2024 5:18 | - | 50 | No | 9835949686 | Satisfied with quality | Male |
| 296 | Asha Rai | 7 | 685 | 4795 | Market Visit | 3 | Groceries | Mobile Payment | 10/5/2024 5:05 | Lalitpur | 0 | No | 9869059335 | Satisfied with quality | Male |
| 297 | Kamal Sharma | 8 | 342 | 2736 | Market Visit | 1 | Electronics | Card | 12/1/2023 8:10 | Pokhara | 200 | No | 9866858364 | Prices a bit high | Male |
| 298 | Gurax Gurung | 11 | 217 | 2864 | Market Visit | 4 | Household | Cash | 1/21/2024 2:00 | Pokhara | 60 | Yes | 9816458335 | Friendly staff | Female |
| 299 | Sarita Lama | 9 | 1644 | 17360 | Market Visit | 3 | Clothing | Cash | 11/12/2023 14:47 | Lalitpur | 0 | Yes | 9893314124 | Prices a bit high | Female |
300 rows × 15 columns
In [56]:
df.head()
Out[56]:
| CustomerName | Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | StoreLocation | DiscountApplied | LoyaltyMember | CustomerContact | Feedback | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bishal Gurung | 11 | 1303 | 17199 | Website | 4 | Electronics | Card | 12/6/2023 5:13 | - | 60 | Yes | 9819894305 | Quick delivery | Male |
| 1 | Sita Tamang | 10 | 610 | 7246 | Market Visit | 4 | Clothing | Mobile Payment | 12/22/2023 12:54 | Pokhara | 240 | Yes | 9823584905 | Satisfied with quality | Female |
| 2 | Sarita Lama | 7 | 1812 | 12684 | Market Visit | 3 | Electronics | Mobile Payment | 10/13/2024 1:03 | Lalitpur | 200 | No | 9823088823 | Friendly staff | Female |
| 3 | Ram Shrestha | 3 | 1070 | 3210 | Website | 1 | Electronics | Mobile Payment | 6/5/2024 6:08 | - | 100 | No | 9888996168 | Quick delivery | Male |
| 4 | Sita Tamang | 10 | 1209 | 14362 | Market Visit | 5 | Personal Care | Card | 10/26/2024 16:28 | Bhaktapur | 60 | Yes | 9874939195 | Prices a bit high | Female |
In [57]:
df['ProductCategory'].unique()
Out[57]:
array(['Electronics', 'Clothing', 'Personal Care', 'Groceries',
'Household'], dtype=object)
In [58]:
df
Out[58]:
| CustomerName | Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | StoreLocation | DiscountApplied | LoyaltyMember | CustomerContact | Feedback | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bishal Gurung | 11 | 1303 | 17199 | Website | 4 | Electronics | Card | 12/6/2023 5:13 | - | 60 | Yes | 9819894305 | Quick delivery | Male |
| 1 | Sita Tamang | 10 | 610 | 7246 | Market Visit | 4 | Clothing | Mobile Payment | 12/22/2023 12:54 | Pokhara | 240 | Yes | 9823584905 | Satisfied with quality | Female |
| 2 | Sarita Lama | 7 | 1812 | 12684 | Market Visit | 3 | Electronics | Mobile Payment | 10/13/2024 1:03 | Lalitpur | 200 | No | 9823088823 | Friendly staff | Female |
| 3 | Ram Shrestha | 3 | 1070 | 3210 | Website | 1 | Electronics | Mobile Payment | 6/5/2024 6:08 | - | 100 | No | 9888996168 | Quick delivery | Male |
| 4 | Sita Tamang | 10 | 1209 | 14362 | Market Visit | 5 | Personal Care | Card | 10/26/2024 16:28 | Bhaktapur | 60 | Yes | 9874939195 | Prices a bit high | Female |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 295 | Kamal Sharma | 6 | 919 | 5514 | Website | 2 | Personal Care | Mobile Payment | 4/23/2024 5:18 | - | 50 | No | 9835949686 | Satisfied with quality | Male |
| 296 | Asha Rai | 7 | 685 | 4795 | Market Visit | 3 | Groceries | Mobile Payment | 10/5/2024 5:05 | Lalitpur | 0 | No | 9869059335 | Satisfied with quality | Male |
| 297 | Kamal Sharma | 8 | 342 | 2736 | Market Visit | 1 | Electronics | Card | 12/1/2023 8:10 | Pokhara | 200 | No | 9866858364 | Prices a bit high | Male |
| 298 | Gurax Gurung | 11 | 217 | 2864 | Market Visit | 4 | Household | Cash | 1/21/2024 2:00 | Pokhara | 60 | Yes | 9816458335 | Friendly staff | Female |
| 299 | Sarita Lama | 9 | 1644 | 17360 | Market Visit | 3 | Clothing | Cash | 11/12/2023 14:47 | Lalitpur | 0 | Yes | 9893314124 | Prices a bit high | Female |
300 rows × 15 columns
In [59]:
# Calculate additional columns
df['TotalDiscount'] = df['DiscountApplied'] * df['Quantity']
df['CustomerFrequency'] = df.groupby('CustomerName')['CustomerName'].transform('count')
df['AverageRatingPerCustomer'] = df.groupby('CustomerName')['Ratings'].transform('mean')
df['TotalSalesPerCustomer'] = df.groupby('CustomerName')['Sales'].transform('sum')
# Extract date features before dropping TransactionDate
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['IsWeekend'] = df['TransactionDate'].dt.dayofweek >= 5
df['IsWeekend'] = df['IsWeekend'].astype(int)
In [60]:
# Extract date features
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['Year'] = df['TransactionDate'].dt.year
df['Month'] = df['TransactionDate'].dt.month
df['Day'] = df['TransactionDate'].dt.day
df['DayOfWeek'] = df['TransactionDate'].dt.dayofweek
In [61]:
# Map day of the week numbers to names
day_of_week_map = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
df['DayOfWeek'] = df['DayOfWeek'].map(day_of_week_map)
In [62]:
# Drop unnecessary columns
df.drop(columns=['CustomerName', 'CustomerContact', 'Feedback'], inplace=True)
In [63]:
df
Out[63]:
| Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | StoreLocation | DiscountApplied | ... | Gender | TotalDiscount | CustomerFrequency | AverageRatingPerCustomer | TotalSalesPerCustomer | IsWeekend | Year | Month | Day | DayOfWeek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 1303 | 17199 | Website | 4 | Electronics | Card | 2023-12-06 05:13:00 | - | 60 | ... | Male | 660 | 49 | 3.693878 | 371058 | 0 | 2023 | 12 | 6 | Wednesday |
| 1 | 10 | 610 | 7246 | Market Visit | 4 | Clothing | Mobile Payment | 2023-12-22 12:54:00 | Pokhara | 240 | ... | Female | 2400 | 47 | 3.361702 | 341186 | 0 | 2023 | 12 | 22 | Friday |
| 2 | 7 | 1812 | 12684 | Market Visit | 3 | Electronics | Mobile Payment | 2024-10-13 01:03:00 | Lalitpur | 200 | ... | Female | 1400 | 42 | 3.380952 | 278411 | 1 | 2024 | 10 | 13 | Sunday |
| 3 | 3 | 1070 | 3210 | Website | 1 | Electronics | Mobile Payment | 2024-06-05 06:08:00 | - | 100 | ... | Male | 300 | 43 | 3.372093 | 211555 | 0 | 2024 | 6 | 5 | Wednesday |
| 4 | 10 | 1209 | 14362 | Market Visit | 5 | Personal Care | Card | 2024-10-26 16:28:00 | Bhaktapur | 60 | ... | Female | 600 | 47 | 3.361702 | 341186 | 1 | 2024 | 10 | 26 | Saturday |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 295 | 6 | 919 | 5514 | Website | 2 | Personal Care | Mobile Payment | 2024-04-23 05:18:00 | - | 50 | ... | Male | 300 | 47 | 3.531915 | 383490 | 0 | 2024 | 4 | 23 | Tuesday |
| 296 | 7 | 685 | 4795 | Market Visit | 3 | Groceries | Mobile Payment | 2024-10-05 05:05:00 | Lalitpur | 0 | ... | Male | 0 | 36 | 3.361111 | 243614 | 1 | 2024 | 10 | 5 | Saturday |
| 297 | 8 | 342 | 2736 | Market Visit | 1 | Electronics | Card | 2023-12-01 08:10:00 | Pokhara | 200 | ... | Male | 1600 | 47 | 3.531915 | 383490 | 0 | 2023 | 12 | 1 | Friday |
| 298 | 11 | 217 | 2864 | Market Visit | 4 | Household | Cash | 2024-01-21 02:00:00 | Pokhara | 60 | ... | Female | 660 | 1 | 4.000000 | 2864 | 1 | 2024 | 1 | 21 | Sunday |
| 299 | 9 | 1644 | 17360 | Market Visit | 3 | Clothing | Cash | 2023-11-12 14:47:00 | Lalitpur | 0 | ... | Female | 0 | 42 | 3.380952 | 278411 | 1 | 2023 | 11 | 12 | Sunday |
300 rows × 21 columns
Data Visualization¶
In [64]:
# Group by Gender and count the number of visits
gender_distribution = df['Gender'].value_counts().reset_index()
gender_distribution.columns = ['Gender', 'Count']
# Create a pie chart
fig = px.pie(gender_distribution,
values='Count',
names='Gender',
title='Gender Distribution of Store Visitors')
fig.update_layout(width=800, height=600)
fig.show()
In [65]:
# visualizations based on Total Sales by Product Category plotly
total_sales_by_product_category = df.groupby('ProductCategory')['Sales'].sum().reset_index()
total_sales_by_product_category = total_sales_by_product_category.sort_values(by='Sales', ascending=False)
#bar
fig = px.bar(total_sales_by_product_category, x='ProductCategory', y='Sales',color='ProductCategory', title='Total Sales by Product Category')
fig.update_layout(xaxis_title="Product Category", yaxis_title="Total Sales")
fig.show()
In [66]:
# visualization based average rating by product category
average_rating_by_product_category = df.groupby('ProductCategory')['Ratings'].mean().reset_index()
average_rating_by_product_category = average_rating_by_product_category.sort_values(by='Ratings', ascending=False)
#pie chart
fig = px.pie(average_rating_by_product_category, names='ProductCategory', values='Ratings', title='Average Rating by Product Category')
fig.show()
In [67]:
#total Sales by payment method
total_sales_by_payment_method = df.groupby('PaymentMethod')['Sales'].sum().reset_index()
total_sales_by_payment_method = total_sales_by_payment_method.sort_values(by='Sales', ascending=False)
# histogram
fig = px.histogram(df, x='PaymentMethod', y='Sales', color='PaymentMethod', title='Total Sales by Payment Method', histfunc='sum')
fig.update_layout(xaxis_title="Payment Method", yaxis_title="Total Sales")
fig.show()
In [68]:
#total sales by channel
total_sales_by_channel = df.groupby('Channel')['Sales'].sum().reset_index()
total_sales_by_channel = total_sales_by_channel.sort_values(by='Sales', ascending=False)
#bar plotly
fig = px.bar(total_sales_by_channel, x='Channel', y='Sales',color='Sales',color_continuous_scale='temps', title='TotalSales by Channel')
fig.update_layout(xaxis_title="Channel", yaxis_title="Total Sales")
fig.show()
In [69]:
#total sales by day of the week
total_sales_by_day = df.groupby('DayOfWeek')['Sales'].sum().reset_index()
days_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
total_sales_by_day['DayOfWeek'] = pd.Categorical(total_sales_by_day['DayOfWeek'], categories=days_order, ordered=True)
total_sales_by_day = total_sales_by_day.sort_values('DayOfWeek')
fig = px.bar(total_sales_by_day, x='DayOfWeek', y='Sales', title='Total Sales by Day of the Week', color='Sales', color_continuous_scale='temps')
fig.update_layout(xaxis_title="Day of the Week", yaxis_title="Total Sales (NPR)")
fig.show()
In [70]:
#visualizations based on Total Sales by Month
total_sales_by_month = df.groupby('Month')['Sales'].sum().reset_index()
fig = px.scatter(total_sales_by_month, x='Month', y='Sales', title='Total Sales by Month',color='Sales',color_continuous_scale='matter')
fig.show()
In [71]:
# Calculate average discount applied by product category
average_discount_by_category = df.groupby('ProductCategory')['DiscountApplied'].mean().reset_index()
# Create a line graph
fig = px.line(average_discount_by_category, x='ProductCategory', y='DiscountApplied', title='Average Discount Applied by Product Category')
fig.update_layout(xaxis_title="Product Category", yaxis_title="Average Discount Applied")
fig.show()
In [72]:
#total sales by gender
sales_by_gender = df.groupby('Gender')['Sales'].sum().reset_index()
#pie
fig = px.pie(sales_by_gender, values='Sales', names='Gender', title='Total Sales by Gender',color='Gender')
fig.show()
In [73]:
# Visualize the total quantity sold by product category
quantity_by_category = df.groupby('ProductCategory')['Quantity'].sum().reset_index()
fig = px.bar(quantity_by_category,
x='ProductCategory',
y='Quantity',
title='Total Quantity Sold by Product Category',
color='Quantity',
color_continuous_scale='matter',
)
# Update the layout to make the plot bigger
fig.update_layout(width=800, height=600)
fig.show()
Feature Engineering¶
In [74]:
#loyality member
df['LoyaltyMember'] = df['LoyaltyMember'].map({
'Yes': '1',
'No': '0'
})
In [75]:
#product Category mapping
df['ProductCategory'] = df['ProductCategory'].map({
'Electronics': '0',
'Clothing': '1',
'Groceries': '2',
'Personal Care': '3',
'Household': '4'
})
In [76]:
#mapping channel name
df['Channel'] = df['Channel'].map({
'Website': '0',
'Market Visit': '1'
})
df['Channel'] = pd.to_numeric(df['Channel'], errors='coerce')
In [77]:
# payment method mapping
df['PaymentMethod'] = df['PaymentMethod'].map({
'Card': '0',
'Mobile Payment': '1',
'Cash': '2'
})
df.head()
Out[77]:
| Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | StoreLocation | DiscountApplied | ... | Gender | TotalDiscount | CustomerFrequency | AverageRatingPerCustomer | TotalSalesPerCustomer | IsWeekend | Year | Month | Day | DayOfWeek | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 1303 | 17199 | 0 | 4 | 0 | 0 | 2023-12-06 05:13:00 | - | 60 | ... | Male | 660 | 49 | 3.693878 | 371058 | 0 | 2023 | 12 | 6 | Wednesday |
| 1 | 10 | 610 | 7246 | 1 | 4 | 1 | 1 | 2023-12-22 12:54:00 | Pokhara | 240 | ... | Female | 2400 | 47 | 3.361702 | 341186 | 0 | 2023 | 12 | 22 | Friday |
| 2 | 7 | 1812 | 12684 | 1 | 3 | 0 | 1 | 2024-10-13 01:03:00 | Lalitpur | 200 | ... | Female | 1400 | 42 | 3.380952 | 278411 | 1 | 2024 | 10 | 13 | Sunday |
| 3 | 3 | 1070 | 3210 | 0 | 1 | 0 | 1 | 2024-06-05 06:08:00 | - | 100 | ... | Male | 300 | 43 | 3.372093 | 211555 | 0 | 2024 | 6 | 5 | Wednesday |
| 4 | 10 | 1209 | 14362 | 1 | 5 | 3 | 0 | 2024-10-26 16:28:00 | Bhaktapur | 60 | ... | Female | 600 | 47 | 3.361702 | 341186 | 1 | 2024 | 10 | 26 | Saturday |
5 rows × 21 columns
In [78]:
# Ratings Category (Low, Medium, High)
def categorize_rating(Ratings):
if Ratings <= 2.5:
return 'Low'
elif Ratings <= 4.0:
return 'Medium'
else:
return 'High'
df['Rating_Category'] = df['Ratings'].apply(categorize_rating)
df.head()
Out[78]:
| Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | StoreLocation | DiscountApplied | ... | TotalDiscount | CustomerFrequency | AverageRatingPerCustomer | TotalSalesPerCustomer | IsWeekend | Year | Month | Day | DayOfWeek | Rating_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 1303 | 17199 | 0 | 4 | 0 | 0 | 2023-12-06 05:13:00 | - | 60 | ... | 660 | 49 | 3.693878 | 371058 | 0 | 2023 | 12 | 6 | Wednesday | Medium |
| 1 | 10 | 610 | 7246 | 1 | 4 | 1 | 1 | 2023-12-22 12:54:00 | Pokhara | 240 | ... | 2400 | 47 | 3.361702 | 341186 | 0 | 2023 | 12 | 22 | Friday | Medium |
| 2 | 7 | 1812 | 12684 | 1 | 3 | 0 | 1 | 2024-10-13 01:03:00 | Lalitpur | 200 | ... | 1400 | 42 | 3.380952 | 278411 | 1 | 2024 | 10 | 13 | Sunday | Medium |
| 3 | 3 | 1070 | 3210 | 0 | 1 | 0 | 1 | 2024-06-05 06:08:00 | - | 100 | ... | 300 | 43 | 3.372093 | 211555 | 0 | 2024 | 6 | 5 | Wednesday | Low |
| 4 | 10 | 1209 | 14362 | 1 | 5 | 3 | 0 | 2024-10-26 16:28:00 | Bhaktapur | 60 | ... | 600 | 47 | 3.361702 | 341186 | 1 | 2024 | 10 | 26 | Saturday | High |
5 rows × 22 columns
Model Training¶
In [79]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300 entries, 0 to 299 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Quantity 300 non-null int64 1 Price 300 non-null int64 2 Sales 300 non-null int64 3 Channel 300 non-null int64 4 Ratings 300 non-null int64 5 ProductCategory 300 non-null object 6 PaymentMethod 300 non-null object 7 TransactionDate 300 non-null datetime64[ns] 8 StoreLocation 300 non-null object 9 DiscountApplied 300 non-null int64 10 LoyaltyMember 300 non-null object 11 Gender 300 non-null object 12 TotalDiscount 300 non-null int64 13 CustomerFrequency 300 non-null int64 14 AverageRatingPerCustomer 300 non-null float64 15 TotalSalesPerCustomer 300 non-null int64 16 IsWeekend 300 non-null int64 17 Year 300 non-null int32 18 Month 300 non-null int32 19 Day 300 non-null int32 20 DayOfWeek 300 non-null object 21 Rating_Category 300 non-null object dtypes: datetime64[ns](1), float64(1), int32(3), int64(10), object(7) memory usage: 48.2+ KB
In [80]:
#data to numeric to columns
df['ProductCategory'] = pd.to_numeric(df['ProductCategory'], errors='coerce')
df['PaymentMethod'] = pd.to_numeric(df['PaymentMethod'], errors='coerce')
df['LoyalityMember'] = pd.to_numeric(df['LoyaltyMember'], errors='coerce')
df.head()
Out[80]:
| Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | StoreLocation | DiscountApplied | ... | CustomerFrequency | AverageRatingPerCustomer | TotalSalesPerCustomer | IsWeekend | Year | Month | Day | DayOfWeek | Rating_Category | LoyalityMember | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 1303 | 17199 | 0 | 4 | 0 | 0 | 2023-12-06 05:13:00 | - | 60 | ... | 49 | 3.693878 | 371058 | 0 | 2023 | 12 | 6 | Wednesday | Medium | 1 |
| 1 | 10 | 610 | 7246 | 1 | 4 | 1 | 1 | 2023-12-22 12:54:00 | Pokhara | 240 | ... | 47 | 3.361702 | 341186 | 0 | 2023 | 12 | 22 | Friday | Medium | 1 |
| 2 | 7 | 1812 | 12684 | 1 | 3 | 0 | 1 | 2024-10-13 01:03:00 | Lalitpur | 200 | ... | 42 | 3.380952 | 278411 | 1 | 2024 | 10 | 13 | Sunday | Medium | 0 |
| 3 | 3 | 1070 | 3210 | 0 | 1 | 0 | 1 | 2024-06-05 06:08:00 | - | 100 | ... | 43 | 3.372093 | 211555 | 0 | 2024 | 6 | 5 | Wednesday | Low | 0 |
| 4 | 10 | 1209 | 14362 | 1 | 5 | 3 | 0 | 2024-10-26 16:28:00 | Bhaktapur | 60 | ... | 47 | 3.361702 | 341186 | 1 | 2024 | 10 | 26 | Saturday | High | 1 |
5 rows × 23 columns
In [81]:
df.info()
df['Channel'].unique()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300 entries, 0 to 299 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Quantity 300 non-null int64 1 Price 300 non-null int64 2 Sales 300 non-null int64 3 Channel 300 non-null int64 4 Ratings 300 non-null int64 5 ProductCategory 300 non-null int64 6 PaymentMethod 300 non-null int64 7 TransactionDate 300 non-null datetime64[ns] 8 StoreLocation 300 non-null object 9 DiscountApplied 300 non-null int64 10 LoyaltyMember 300 non-null object 11 Gender 300 non-null object 12 TotalDiscount 300 non-null int64 13 CustomerFrequency 300 non-null int64 14 AverageRatingPerCustomer 300 non-null float64 15 TotalSalesPerCustomer 300 non-null int64 16 IsWeekend 300 non-null int64 17 Year 300 non-null int32 18 Month 300 non-null int32 19 Day 300 non-null int32 20 DayOfWeek 300 non-null object 21 Rating_Category 300 non-null object 22 LoyalityMember 300 non-null int64 dtypes: datetime64[ns](1), float64(1), int32(3), int64(13), object(5) memory usage: 50.5+ KB
Out[81]:
array([0, 1])
In [82]:
df.describe()
Out[82]:
| Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | TransactionDate | DiscountApplied | TotalDiscount | CustomerFrequency | AverageRatingPerCustomer | TotalSalesPerCustomer | IsWeekend | Year | Month | Day | LoyalityMember | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 |
| mean | 6.126667 | 1046.113333 | 6848.303333 | 0.470000 | 3.473333 | 1.946667 | 0.946667 | 2024-05-14 01:22:03.600000 | 98.366667 | 603.300000 | 41.733333 | 3.473333 | 291138.376667 | 0.296667 | 2023.853333 | 6.673333 | 15.666667 | 0.490000 |
| min | 1.000000 | 107.000000 | 145.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2023-11-10 23:24:00 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1071.000000 | 0.000000 | 2023.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 4.000000 | 532.000000 | 2413.250000 | 0.000000 | 2.000000 | 1.000000 | 0.000000 | 2024-02-07 20:28:45 | 50.000000 | 87.500000 | 42.000000 | 3.361702 | 211555.000000 | 0.000000 | 2024.000000 | 4.000000 | 7.750000 | 0.000000 |
| 50% | 6.000000 | 1086.000000 | 5165.500000 | 0.000000 | 4.000000 | 2.000000 | 1.000000 | 2024-05-19 12:17:00 | 100.000000 | 480.000000 | 43.000000 | 3.380952 | 278411.000000 | 0.000000 | 2024.000000 | 7.000000 | 16.000000 | 0.000000 |
| 75% | 9.000000 | 1524.250000 | 10034.000000 | 1.000000 | 5.000000 | 3.000000 | 2.000000 | 2024-08-11 11:19:00 | 120.000000 | 960.000000 | 47.000000 | 3.535714 | 371058.000000 | 1.000000 | 2024.000000 | 10.000000 | 24.000000 | 1.000000 |
| max | 11.000000 | 1998.000000 | 25819.000000 | 1.000000 | 5.000000 | 4.000000 | 2.000000 | 2024-11-04 18:03:00 | 240.000000 | 2640.000000 | 49.000000 | 5.000000 | 383490.000000 | 1.000000 | 2024.000000 | 12.000000 | 31.000000 | 1.000000 |
| std | 2.997874 | 554.192076 | 5634.298122 | 0.499933 | 1.332369 | 1.500442 | 0.824270 | NaN | 79.662744 | 593.775497 | 9.144815 | 0.235653 | 83860.974326 | 0.457552 | 0.354364 | 3.465061 | 9.047379 | 0.500735 |
In [83]:
corr = df.corr(numeric_only=1)
corr
Out[83]:
| Quantity | Price | Sales | Channel | Ratings | ProductCategory | PaymentMethod | DiscountApplied | TotalDiscount | CustomerFrequency | AverageRatingPerCustomer | TotalSalesPerCustomer | IsWeekend | Year | Month | Day | LoyalityMember | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Quantity | 1.000000 | 0.012748 | 0.668159 | 0.058332 | -0.079534 | 0.029761 | 0.036580 | 0.002690 | 0.482048 | 0.032955 | -0.058428 | 0.084254 | 0.001772 | -0.010788 | -0.024658 | -0.051461 | 0.183539 |
| Price | 0.012748 | 1.000000 | 0.647220 | -0.044857 | 0.068485 | -0.012232 | -0.035840 | -0.091466 | -0.086800 | 0.027917 | 0.109680 | 0.092431 | 0.022302 | -0.021935 | 0.045250 | -0.073137 | 0.003668 |
| Sales | 0.668159 | 0.647220 | 1.000000 | -0.018217 | 0.038459 | 0.004102 | 0.007956 | -0.073233 | 0.233778 | 0.103906 | 0.042509 | 0.166571 | 0.035740 | -0.033195 | 0.020542 | -0.113691 | 0.173722 |
| Channel | 0.058332 | -0.044857 | -0.018217 | 1.000000 | 0.026411 | 0.100407 | -0.028244 | 0.169659 | 0.133675 | 0.009217 | 0.011350 | 0.007948 | 0.104833 | -0.043798 | 0.061897 | 0.020704 | -0.001202 |
| Ratings | -0.079534 | 0.068485 | 0.038459 | 0.026411 | 1.000000 | 0.005978 | 0.044381 | 0.054888 | -0.027304 | -0.008820 | 0.176868 | 0.019365 | -0.011667 | -0.015395 | -0.032319 | -0.055120 | 0.307897 |
| ProductCategory | 0.029761 | -0.012232 | 0.004102 | 0.100407 | 0.005978 | 1.000000 | -0.018533 | 0.026690 | 0.054105 | -0.002746 | -0.063846 | -0.007227 | 0.076711 | 0.054431 | -0.022017 | -0.019053 | 0.030448 |
| PaymentMethod | 0.036580 | -0.035840 | 0.007956 | -0.028244 | 0.044381 | -0.018533 | 1.000000 | 0.018533 | -0.009548 | -0.079983 | 0.087827 | -0.029193 | 0.033225 | -0.015419 | -0.132586 | 0.063982 | 0.071631 |
| DiscountApplied | 0.002690 | -0.091466 | -0.073233 | 0.169659 | 0.054888 | 0.026690 | 0.018533 | 1.000000 | 0.787155 | 0.041269 | 0.027291 | 0.039173 | 0.060134 | 0.049538 | -0.106865 | 0.012745 | 0.150925 |
| TotalDiscount | 0.482048 | -0.086800 | 0.233778 | 0.133675 | -0.027304 | 0.054105 | -0.009548 | 0.787155 | 1.000000 | 0.069091 | 0.027262 | 0.091305 | 0.018420 | 0.034892 | -0.088651 | -0.000299 | 0.182845 |
| CustomerFrequency | 0.032955 | 0.027917 | 0.103906 | 0.009217 | -0.008820 | -0.002746 | -0.079983 | 0.041269 | 0.069091 | 1.000000 | -0.049870 | 0.867935 | 0.002984 | -0.058552 | 0.106904 | -0.011709 | 0.041777 |
| AverageRatingPerCustomer | -0.058428 | 0.109680 | 0.042509 | 0.011350 | 0.176868 | -0.063846 | 0.087827 | 0.027291 | 0.027262 | -0.049870 | 1.000000 | 0.109487 | -0.030067 | 0.015437 | -0.013972 | 0.086413 | 0.048151 |
| TotalSalesPerCustomer | 0.084254 | 0.092431 | 0.166571 | 0.007948 | 0.019365 | -0.007227 | -0.029193 | 0.039173 | 0.091305 | 0.867935 | 0.109487 | 1.000000 | -0.036349 | -0.017618 | 0.049690 | 0.008862 | 0.050110 |
| IsWeekend | 0.001772 | 0.022302 | 0.035740 | 0.104833 | -0.011667 | 0.076711 | 0.033225 | 0.060134 | 0.018420 | 0.002984 | -0.030067 | -0.036349 | 1.000000 | -0.060781 | 0.002264 | -0.025315 | 0.034888 |
| Year | -0.010788 | -0.021935 | -0.033195 | -0.043798 | -0.015395 | 0.054431 | -0.015419 | 0.049538 | 0.034892 | -0.058552 | 0.015437 | -0.017618 | -0.060781 | 1.000000 | -0.616585 | -0.065372 | 0.010555 |
| Month | -0.024658 | 0.045250 | 0.020542 | 0.061897 | -0.032319 | -0.022017 | -0.132586 | -0.106865 | -0.088651 | 0.106904 | -0.013972 | 0.049690 | 0.002264 | -0.616585 | 1.000000 | 0.048896 | -0.113688 |
| Day | -0.051461 | -0.073137 | -0.113691 | 0.020704 | -0.055120 | -0.019053 | 0.063982 | 0.012745 | -0.000299 | -0.011709 | 0.086413 | 0.008862 | -0.025315 | -0.065372 | 0.048896 | 1.000000 | -0.033221 |
| LoyalityMember | 0.183539 | 0.003668 | 0.173722 | -0.001202 | 0.307897 | 0.030448 | 0.071631 | 0.150925 | 0.182845 | 0.041777 | 0.048151 | 0.050110 | 0.034888 | 0.010555 | -0.113688 | -0.033221 | 1.000000 |
In [84]:
# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='magma', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
In [85]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score
Features= df[['Quantity', 'Price','ProductCategory', 'PaymentMethod', 'LoyaltyMember']]
Target = df['Sales']
X_train, X_test, y_train, y_test = train_test_split(Features, Target, test_size=0.2, random_state=42)
In [100]:
#gradient boosting regressor
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_absolute_error(y_test, y_pred)
r2_reg= r2_score(y_test, y_pred)
print(f'Mean Absolute Error: {mse}')
print(f'R-squared: {r2_reg}')
#to view the actual and predicted values
for actual, predicted in zip(y_test, y_pred):
print(f"Actual: {actual}, Predicted: {int(predicted)}")
Mean Absolute Error: 0.12313124521990503 R-squared: 0.8302825355373844 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 1, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 1, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 1, Predicted: 0 Actual: 1, Predicted: 0 Actual: 1, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0
In [87]:
# Visualization of predicted vs actual costs
fig_cost = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
type="line", line=dict(dash='dash'),
x0=y_test.min(), y0=y_test.min(),
x1=y_test.max(), y1=y_test.max()
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()
Hyperparameter tuning and cross validation for Gradient Boosting Regressor¶
In [ ]:
#hyperparameter tuning and cross validation for gradientboostingRegressor
gbr = GradientBoostingRegressor(
random_state=42,
n_estimators=200,
max_depth=3,
min_samples_split=2,
min_samples_leaf=2,
learning_rate=0.1,
)
gbr.fit(X_train, y_train)
y_pred_reg = gbr.predict(X_test)
cv_scores_r2_reg = cross_val_score(gbr, Features, Target, cv=5, scoring='r2')
cv_scores_mae = cross_val_score(gbr, Features, Target, cv=5, scoring='neg_mean_absolute_error')
cv_scores_mae = -cv_scores_mae
print("Mean R2 score:", cv_scores_r2_reg.mean())
print("Mean MAE score:", cv_scores_mae.mean())
#to view the actual and predicted values
for actual, predicted in zip(y_test, y_pred_reg):
print(f"Actual: {actual}, Predicted: {int(predicted)}")
Mean R2 score: 0.9926229878124989 Mean MAE score: 348.9119389870635 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 1, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 1, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 1, Predicted: 0 Actual: 1, Predicted: 0 Actual: 1, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 1 Actual: 0, Predicted: 0 Actual: 0, Predicted: 0 Actual: 1, Predicted: 0
In [89]:
# Visualization of predicted vs actual costs after hyperparameter tuning
fig_cost = px.scatter(x=y_test, y=y_pred_reg, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
type="line", line=dict(dash='dash'),
x0=y_test.min(), y0=y_test.min(),
x1=y_test.max(), y1=y_test.max()
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()
In [90]:
#save the model
import joblib
joblib.dump(gbr, 'sales_prediction_model.pkl')
Out[90]:
['sales_prediction_model.pkl']
In [91]:
# for classification of the loyalty member
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
Features_clf = df[['Quantity','Sales', 'ProductCategory', 'PaymentMethod', 'DiscountApplied', 'TotalDiscount', 'CustomerFrequency', 'AverageRatingPerCustomer', 'TotalSalesPerCustomer']]
Target_clf = df['LoyaltyMember']
X_train, X_test, y_train, y_test = train_test_split(Features_clf, Target_clf, test_size=0.2, random_state=42)
model_clf = RandomForestClassifier()
model_clf.fit(X_train, y_train)
y_pred_clf = model_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_clf)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred_clf))
Accuracy: 0.8833333333333333
precision recall f1-score support
0 0.97 0.83 0.89 35
1 0.80 0.96 0.87 25
accuracy 0.88 60
macro avg 0.88 0.89 0.88 60
weighted avg 0.90 0.88 0.88 60
In [92]:
# confusion matrix by plotly
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_clf)
# Create a confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
z=cm,
x=['Predicted Negative', 'Predicted Positive'],
y=['Actual Negative', 'Actual Positive'],
hoverongaps=False,
colorscale='Viridis'))
fig.update_layout(
title='Confusion Matrix',
xaxis_title='Predicted Label',
yaxis_title='True Label'
)
fig.show()
In [93]:
# random forest classifier
from sklearn.tree import plot_tree
num_estimators = len(model_clf.estimators_)
print(f'The model has {num_estimators} estimators.')
plt.figure(figsize=(20, 10))
plot_tree(model_clf.estimators_[0], filled=True, feature_names=Features_clf.columns, class_names=[str(cls) for cls in model_clf.classes_])
plt.show()
The model has 100 estimators.
Hyperparameter tuning and cross validation for RandomForestClassifier¶
In [94]:
#hyperparameter tuning and cross validation for RandomForestClassifier
rfc = RandomForestClassifier(
random_state=42,
n_estimators=300,
max_depth=3,
min_samples_split=2,
min_samples_leaf=2,
)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
cv_scores_rfc = cross_val_score(rfc, Features_clf, Target_clf, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores for Random Forest Classifier:", cv_scores_rfc)
print("Mean accuracy score:", cv_scores_rfc.mean())
Cross-validation accuracy scores for Random Forest Classifier: [0.93333333 0.7 0.73333333 0.78333333 0.8 ] Mean accuracy score: 0.79
In [95]:
#to view the actual and predicted values
for actual, predicted in zip(y_test, y_pred_reg):
print(f"Actual: {actual}, Predicted: {int(predicted)}")
Actual: 0, Predicted: 978 Actual: 1, Predicted: 22402 Actual: 1, Predicted: 2895 Actual: 1, Predicted: 10013 Actual: 0, Predicted: 10082 Actual: 1, Predicted: 7024 Actual: 1, Predicted: 16117 Actual: 1, Predicted: 21932 Actual: 0, Predicted: 7211 Actual: 0, Predicted: 919 Actual: 0, Predicted: 9603 Actual: 1, Predicted: 23343 Actual: 0, Predicted: 2786 Actual: 0, Predicted: 3865 Actual: 0, Predicted: 5421 Actual: 0, Predicted: 4372 Actual: 1, Predicted: 4517 Actual: 0, Predicted: 2125 Actual: 0, Predicted: 2261 Actual: 0, Predicted: 1195 Actual: 1, Predicted: 2196 Actual: 0, Predicted: 11410 Actual: 0, Predicted: 759 Actual: 1, Predicted: 3339 Actual: 0, Predicted: 656 Actual: 1, Predicted: 860 Actual: 0, Predicted: 4539 Actual: 0, Predicted: 5607 Actual: 1, Predicted: 5883 Actual: 0, Predicted: 2907 Actual: 1, Predicted: 14658 Actual: 1, Predicted: 7766 Actual: 1, Predicted: 3138 Actual: 1, Predicted: 1527 Actual: 1, Predicted: 8027 Actual: 0, Predicted: 2877 Actual: 0, Predicted: 3717 Actual: 0, Predicted: 20326 Actual: 0, Predicted: 9513 Actual: 1, Predicted: 3037 Actual: 0, Predicted: 515 Actual: 1, Predicted: 21791 Actual: 0, Predicted: 10407 Actual: 1, Predicted: 2360 Actual: 0, Predicted: 1641 Actual: 0, Predicted: 1439 Actual: 0, Predicted: 706 Actual: 1, Predicted: 3131 Actual: 0, Predicted: 2502 Actual: 1, Predicted: 3677 Actual: 0, Predicted: 13820 Actual: 0, Predicted: 4010 Actual: 1, Predicted: 10658 Actual: 0, Predicted: 8420 Actual: 0, Predicted: 2593 Actual: 0, Predicted: 3445 Actual: 1, Predicted: 1822 Actual: 0, Predicted: 7431 Actual: 0, Predicted: 5725 Actual: 1, Predicted: 2822
In [ ]: